Heatmap function
create_gene_heatmap <- function(de_results_df, dge_object, font_size = 12, num_genes = 20) {
# Subset the genes
top_genes <- de_results_df$Symbol[1:num_genes]
subset_matrix <- dge_object$logCPM[top_genes, colnames(dge_object$counts)]
# Create annotation dataframe
sample_info <- dge_object$samples
annotation_df <- data.frame(
Dose = sample_info$Dose
)
rownames(annotation_df) <- colnames(subset_matrix)
# Reorder gene expression matrix
ordered_samples <- order(sample_info$Dose)
subset_matrix <- subset_matrix[, ordered_samples]
sample_info <- sample_info[ordered_samples, ]
# Create the heatmap
heatmap <- pheatmap(subset_matrix,
scale = "row",
cluster_rows = FALSE,
cluster_cols = FALSE,
color = viridis(50),
show_colnames = FALSE,
annotation_col = annotation_df,
fontsize = font_size,
silent = FALSE)
}These are the first experiments where I use version 2 chemistry. Read 2 is now the barcode and UMI. Read 1 is the cDNA. This was done to avoid reading through the TSO on Ilumina read 1
This sequencing run consists of 3 experiments:
Test version 2 protocol with a biological application from Zac Moore of Brain Cancer lab https://au-mynotebook.labarchives.com/share/Piper_Research_Project/MzEuMjAwMDAwMDAwMDAwMDAzfDE3MTc2NS8yNC9UcmVlTm9kZS8zMzczNDM0NjE1fDc5LjE5OTk5OTk5OTk5OTk5
Differential expression testing. Focus on the genes that vary between doses.
This object was generated in 2A_Clustering.
Subset only day 3 timepoint for simplicity
sce <- readRDS(here::here(
"data/TIRE_brain_human/SCEs/", "brainCancer_subset.sce.rds"
))
dge <- scran::convertTo(sce, type="edgeR")
day <- 3
dge <- dge[,dge$samples$Day_Exposure %in% day]
tb <- as_tibble(dge$samples)Convert the numerical dose column to a factor. For simplicity sake keep the doses with rounded concentrations.
tb$Dose <- as.factor(tb$Dose_M)
tb$Dose <- recode(tb$Dose,
"0" = "DMSO",
"1e-04" = "100uM",
"1e-07" = "100nM",
"1e-06" = "1uM",
"1e-05" = "10uM")
dge$samples$Dose <- tb$Dose
dge <- dge[,dge$samples$Dose %in% c("DMSO", "10uM", "100uM", "1uM", "100nM")]
dge$samples$Dose <- droplevels(dge$samples$Dose)Have a look at the important metadata.
Remove genes that are lowly expressed. 12,000 genes are kept
## Mode FALSE TRUE
## logical 12599 11653
Correct for composition biases by computing normalization factors with the trimmed mean of M-values method.
Model matrix generation.
## DoseDMSO Dose100nM Dose1uM Dose10uM Dose100uM
## AACGTCAGCC 0 1 0 0 0
## ACAGTAGCTC 0 0 0 0 1
## ATCAAGAACG 0 0 0 1 0
## CAATTGTTCG 0 1 0 0 0
## CATGTACTTG 0 1 0 0 0
## CCAAGACGTG 0 0 0 1 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01203 0.01421 0.02015 0.02395 0.03243 0.04630
Create the contrast matrix. Foucs on the largest changes.
contr.matrix <- makeContrasts(
Dose100uM = Dose100uM - DoseDMSO,
Dose10uM = Dose10uM - DoseDMSO,
Dose1uM = Dose1uM - DoseDMSO,
Dose100nM = Dose100nM - DoseDMSO,
Dose100uM_vs_10uM = Dose100uM - Dose10uM,
levels = colnames(design))In each contrast, the format is A - B where:
Fit BCV
par(mfrow=c(1,2))
v <- voom(dge, design, plot=TRUE)
vfit <- lmFit(v, design)
vfit <- contrasts.fit(vfit, contrasts=contr.matrix)
efit <- eBayes(vfit)
plotSA(efit, main="Final model: Mean-variance trend")hundred_uM <- topTable(efit, coef=1, n=length(efit$genes$ID), sort.by = "logFC")
results <- as_tibble(hundred_uM)
results$ID <- rownames(hundred_uM)
# add a column of NAs
results$DElabel <- "n/s"
# if log2Foldchange > 1 or < -1 and pvalue < 0.05, set as "UP"
results$DElabel[results$logFC > 1 & results$adj.P.Val < 0.1] <- "TMZ"
results$DElabel[results$logFC < -1 & results$adj.P.Val < 0.1] <- "DMSO"
results_hundred <- resultsAdd gene labels
results$genelabels <- ""
results$genelabels[results$Symbol == "GDF15"] <- "GDF15"
results$genelabels[results$Symbol == "FAS"] <- "FAS"
results$genelabels[results$Symbol == "IGFBP5"] <- "IGFBP5"
results$genelabels[results$Symbol == "NEAT1"] <- "NEAT1"
results$genelabels[results$Symbol == "CDKN1A"] <- "CDKN1A"
results$genelabels[results$Symbol == "PDGFRA"] <- "PDGFRA"
results$genelabels[results$Symbol == "HMGCS1"] <- "HMGCS1"
results$genelabels[results$Symbol == "SCN1A"] <- "SCN1A"results$DElabel <- factor(results$DElabel, levels = c("TMZ", "n/s", "DMSO")) # Adjust as per your actual labels
plt1 <- ggplot(data = results, aes(x = logFC, y = -log10(adj.P.Val), colour = DElabel, label = genelabels)) +
geom_point(alpha = 0.33, size = 1.5) +
geom_text_repel(size = 4, colour = "black") +
guides(colour = guide_legend(override.aes = list(size = 3, alpha = 1))) +
scale_color_manual(
values = c("darkorange", "grey", "darkblue"), # Colors in the desired order
name = "Upregulated",
labels = c("TMZ", "n/s", "DMSO") # Optional: Add custom labels
) +
geom_vline(xintercept = 1, linetype = "dotted") +
geom_vline(xintercept = -1, linetype = "dotted") +
theme_Publication()
plt1dge$logCPM <- edgeR::cpm(dge, log=TRUE, prior.count=1)
dge_subset <- dge[,dge$samples$Dose == "DMSO" | dge$samples$Dose == "100uM"]
create_gene_heatmap(de_results_df = results, dge_object = dge_subset, font_size = 12, num_genes=10)Need to convert geneIDs from ensembl to enterez
geneids <- as.data.frame(v$genes$ID)
colnames(geneids) <- "ENSEMBL"
geneids$entrez <- mapIds(org.Hs.eg.db, keys = geneids$ENSEMBL, keytype = "ENSEMBL", column = "ENTREZID")Nothing surprising here, cell ycle goes down p53 goes up though very significant.
load("data/MSigDB/human_H_v5p2.rdata")
idx <- ids2indices(Hs.H,identifiers = geneids$entrez)
cam.100uM <- camera(v,idx,design,contrast=contr.matrix[,1])
head(cam.100uM,10)Visualize the gene set testing.
par(mfrow=c(1,1))
barcodeplot(efit$t[,1], index=idx$HALLMARK_P53_PATHWAY,
index2 = idx$HALLMARK_E2F_TARGETS)Visualize as a barplot
ten_uM <- topTable(efit, coef=2, n=length(efit$genes$ID), sort.by = "logFC")
results <- as_tibble(ten_uM)
results$ID <- rownames(ten_uM)
# add a column of NAs
results$DElabel <- "n/s"
# if log2Foldchange > 1 or < -1 and pvalue < 0.05, set as "UP"
results$DElabel[results$logFC > 1 & results$adj.P.Val < 0.1] <- "TMZ"
results$DElabel[results$logFC < -1 & results$adj.P.Val < 0.1] <- "DMSO"
results_ten <- resultsAdd gene labels
results$genelabels <- ""
results$genelabels[results$Symbol == "GDF15"] <- "GDF15"
results$genelabels[results$Symbol == "FAS"] <- "FAS"
results$genelabels[results$Symbol == "CDKN1A"] <- "CDKN1A"
results$genelabels[results$Symbol == "PDGFRA"] <- "PDGFRA"
results$genelabels[results$Symbol == "FADS2"] <- "FADS2"
results$genelabels[results$Symbol == "MDM2"] <- "MDM2"
results$genelabels[results$Symbol == "AL365181.3"] <- "AL365181.3"results$DElabel <- factor(results$DElabel, levels = c("TMZ", "n/s", "DMSO")) # Adjust as per your actual labels
plt2 <- ggplot(data = results, aes(x = logFC, y = -log10(adj.P.Val), colour = DElabel, label = genelabels)) +
geom_point(alpha = 0.33, size = 1.5) +
geom_text_repel(size = 4, colour = "black") +
guides(colour = guide_legend(override.aes = list(size = 3, alpha = 1))) +
scale_color_manual(
values = c("darkorange", "grey", "darkblue"), # Colors in the desired order
name = "Upregulated",
labels = c("TMZ", "n/s", "DMSO") # Optional: Add custom labels
) +
geom_vline(xintercept = 1, linetype = "dotted") +
geom_vline(xintercept = -1, linetype = "dotted") +
theme_Publication()
plt2With the spline analysis of dose in notebook 3A_dose_spline there is
a maximum gene expression effect at 10uM that declines at 100uM.
My interpretation is that cells are adapting to TMZ up to the 10uM dose
then most are being killed with only the survivors being alive and
available to have their transcriptome sequenced at 100uM dose.
So this comparison covers the small set of surviving cells at 100uM vs the majority of adapted cells at 10uM.
hundred_v_ten <- topTable(efit, coef=5, n=length(efit$genes$ID), sort.by = "logFC")
results <- as_tibble(hundred_v_ten)
results$ID <- rownames(hundred_v_ten)
# add a column of NAs
results$DElabel <- "n/s"
# if log2Foldchange > 1 or < -1 and pvalue < 0.05, set as "UP"
results$DElabel[results$logFC > 1 & results$adj.P.Val < 0.1] <- "100uM"
results$DElabel[results$logFC < -1 & results$adj.P.Val < 0.1] <- "10uM"Add gene labels
results$genelabels <- ""
results$genelabels[results$Symbol == "MDM2"] <- "MDM2"
results$genelabels[results$Symbol == "E2F7"] <- "E2F7"
results$genelabels[results$Symbol == "DDIT3"] <- "DDIT3"
results$genelabels[results$Symbol == "PTP4A1"] <- "PTP4A1"Not many differentially expressed genes here. Likely the gene expression is pretty subtle.
# Update the plot
plt2 <- ggplot(data = results, aes(x = logFC, y = -log10(adj.P.Val), colour = DElabel, label = genelabels)) +
geom_point(alpha = 1, size = 1.5) +
geom_text_repel(size = 4, colour = "black") +
guides(colour = guide_legend(override.aes = list(size = 3, alpha = 1))) +
scale_color_manual(
values = c("darkblue", "grey"),
name = "Upregulated"
) +
geom_vline(xintercept = 1, linetype = "dotted") +
geom_vline(xintercept = -1, linetype = "dotted") +
theme_Publication()
plt2Clearly enriched p53 pathway in the 10uM dose (left). On the right 100uM, likely stemness phenotype which is consistent with the literature.
Generate gene set barchart
All the genes are in common.
hundred_v_dmso <- topTable(efit, coef=1, n=length(efit$genes$ID), sort.by = "logFC", p.value = 0.05, lfc=1)
ten_v_dmso <- topTable(efit, coef=2, n=length(efit$genes$ID), sort.by = "logFC", p.value = 0.05, lfc=1)
venn_list <- list(
"100uM vs DMSO" = row.names(hundred_v_dmso),
"10uM vs DMSO" = row.names(ten_v_dmso)
)
ggvenn(venn_list,
show_elements = F, label_sep = "\n",
text_size = 8,
show_percentage = FALSE,
fill_color = c("navy", "springgreen4")
)What are the 8 genes that are DE in 100uM vs DMSO?
# Extract the gene IDs from both data frames
genes_hundred <- hundred_v_dmso$ID
genes_ten <- ten_v_dmso$ID
# Find genes that are in hundred_v_dmso but not in ten_v_dmso
unique_genes <- setdiff(genes_hundred, genes_ten)
# To get the rows from hundred_v_dmso that are not in ten_v_dmso
hundred_v_dmso[hundred_v_dmso$ID %in% unique_genes, ]$Symbol## [1] "RNF26" "MELK" "HIST1H1E" "TMEM97" "CDCA7" "KIAA1549L"
## [7] "TUBA1B" "SRSF1"
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Red Hat Enterprise Linux 9.5 (Plow)
##
## Matrix products: default
## BLAS: /stornext/System/data/software/rhel/9/base/tools/R/4.4.1/lib64/R/lib/libRblas.so
## LAPACK: /stornext/System/data/software/rhel/9/base/tools/R/4.4.1/lib64/R/lib/libRlapack.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggthemes_5.1.0 here_1.0.1
## [3] patchwork_1.3.0 ggvenn_0.1.10
## [5] viridis_0.6.5 viridisLite_0.4.2
## [7] pheatmap_1.0.12 ggrepel_0.9.6
## [9] scater_1.32.1 org.Hs.eg.db_3.19.1
## [11] AnnotationDbi_1.66.0 RColorBrewer_1.1-3
## [13] edgeR_4.2.2 limma_3.60.6
## [15] biomaRt_2.60.1 scuttle_1.14.0
## [17] lubridate_1.9.3 forcats_1.0.0
## [19] stringr_1.5.1 dplyr_1.1.4
## [21] purrr_1.0.2 readr_2.1.5
## [23] tidyr_1.3.1 tibble_3.2.1
## [25] ggplot2_3.5.1 tidyverse_2.0.0
## [27] SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0
## [29] Biobase_2.64.0 GenomicRanges_1.56.2
## [31] GenomeInfoDb_1.40.1 IRanges_2.38.1
## [33] S4Vectors_0.42.1 BiocGenerics_0.50.0
## [35] MatrixGenerics_1.16.0 matrixStats_1.4.1
##
## loaded via a namespace (and not attached):
## [1] rstudioapi_0.17.0 jsonlite_1.8.9
## [3] magrittr_2.0.3 ggbeeswarm_0.7.2
## [5] farver_2.1.2 rmarkdown_2.28
## [7] zlibbioc_1.50.0 vctrs_0.6.5
## [9] memoise_2.0.1 DelayedMatrixStats_1.26.0
## [11] htmltools_0.5.8.1 S4Arrays_1.4.1
## [13] progress_1.2.3 curl_5.2.3
## [15] BiocNeighbors_1.22.0 SparseArray_1.4.8
## [17] sass_0.4.9 bslib_0.8.0
## [19] httr2_1.0.5 cachem_1.1.0
## [21] igraph_2.0.3 lifecycle_1.0.4
## [23] pkgconfig_2.0.3 rsvd_1.0.5
## [25] Matrix_1.7-0 R6_2.5.1
## [27] fastmap_1.2.0 GenomeInfoDbData_1.2.12
## [29] digest_0.6.37 colorspace_2.1-1
## [31] rprojroot_2.0.4 dqrng_0.4.1
## [33] irlba_2.3.5.1 RSQLite_2.3.7
## [35] beachmat_2.20.0 labeling_0.4.3
## [37] filelock_1.0.3 fansi_1.0.6
## [39] timechange_0.3.0 httr_1.4.7
## [41] abind_1.4-8 compiler_4.4.1
## [43] bit64_4.5.2 withr_3.0.1
## [45] BiocParallel_1.38.0 DBI_1.2.3
## [47] highr_0.11 rappdirs_0.3.3
## [49] DelayedArray_0.30.1 bluster_1.14.0
## [51] tools_4.4.1 vipor_0.4.7
## [53] beeswarm_0.4.0 glue_1.8.0
## [55] cluster_2.1.6 generics_0.1.3
## [57] gtable_0.3.5 tzdb_0.4.0
## [59] hms_1.1.3 metapod_1.12.0
## [61] BiocSingular_1.20.0 ScaledMatrix_1.12.0
## [63] xml2_1.3.6 utf8_1.2.4
## [65] XVector_0.44.0 pillar_1.9.0
## [67] splines_4.4.1 BiocFileCache_2.12.0
## [69] lattice_0.22-6 bit_4.5.0
## [71] tidyselect_1.2.1 locfit_1.5-9.10
## [73] Biostrings_2.72.1 knitr_1.48
## [75] gridExtra_2.3 xfun_0.48
## [77] statmod_1.5.0 stringi_1.8.4
## [79] UCSC.utils_1.0.0 yaml_2.3.10
## [81] evaluate_1.0.1 codetools_0.2-20
## [83] cli_3.6.3 munsell_0.5.1
## [85] jquerylib_0.1.4 Rcpp_1.0.13
## [87] dbplyr_2.5.0 png_0.1-8
## [89] parallel_4.4.1 blob_1.2.4
## [91] prettyunits_1.2.0 scran_1.32.0
## [93] sparseMatrixStats_1.16.0 scales_1.3.0
## [95] crayon_1.5.3 rlang_1.1.4
## [97] KEGGREST_1.44.1